## Create 100 best subsets, run Cox models in all of these subsets

rm(list = ls())
load("~/Dropbox/Current projects/GDR/Spatial diffusion/Data/Modeling data 2.RData")

dim(countydata6)
colnames(countydata6)


## balance test comparing counties with and without WGTV for all covariates included in 
## our analysis; use ITM_Dresden as measure of WGTV access
WGTV <- countydata6$ITM_Dresden
X <- countydata6[,c(12,13,59,44,21,24,45,46,30:31,28,20,19,32:35,39:40,41:43,78,81)]

X[,1] <- log(X[,1])
colnames(X)[1] <- "log(population_1988)"

X[,4] <- as.numeric(as.logical(X[,4]))


## drop WGTV counties adjacent to non-WGTV counties
omit.counties <- NULL

m <- 1
for(m in 1:nrow(X))	{

	if(WGTV[m] == TRUE) { next }

	temp <- which(adjacency$county == countydata6$county[m])
	temp <- adjacency[temp,]
	temp <- temp[-c(1:2)]
	temp <- temp[-which(is.na(temp))]

	neighbors <- adjacency$county[abs(unlist(temp))]

		k <- 1
		for(k in 1:length(neighbors))	{

			sel <- which(countydata6$county == neighbors[k])
			if(WGTV[sel] == FALSE) { next }

		omit.counties <- c(omit.counties,neighbors[k])

										}
						}

omit.counties <- unique(omit.counties)
## save(omit.counties, file = "~/Dropbox/Current projects/GDR/Dave analysis/Omit counties RR.RData")


sel <- which(is.element(countydata6$county, omit.counties))

## drop neighboring counties from dataset
X.nei <- X[-sel,]
WGTV.nei <- WGTV[-sel]
countydata6.nei <- countydata6[-sel,]
rm(sel,temp,k,m,neighbors,adjacency)


## Propensity scores using probit
ps.out <- glm(WGTV.nei ~ ., data = X.nei, family = binomial(link = "probit"))
summary(ps.out)

e <- fitted(ps.out, type = "response")
range(e)


## plot propensity score distributions
set.seed(123)
plot(seq(from = 0, to = 1, length.out = 100), seq(from = 0, to = 1, length.out = 100),
	type = "n", xlab = "Estimated propensity score", ylab = "",
	cex.axis = 1.3, yaxt = "n", cex.lab = 1.3)

x <- 
c(
min(e[WGTV.nei == TRUE]),
max(e[WGTV.nei == FALSE]),
max(e[WGTV.nei == FALSE]),
min(e[WGTV.nei == TRUE])
)
y <- c(-.2,-.2,1.2,1.2)
polygon(x = x, y = y, col = "#D3D3D37F", border = NA)

points(	x = e[WGTV.nei == TRUE], y = .2 + rnorm(length(e[WGTV.nei == TRUE]), 0, .075),
		col = "#FF000033", pch = 20, cex = 1.75)

points(	x = e[WGTV.nei == FALSE], y = .8 + rnorm(length(e[WGTV.nei == F]), 0, .075),
		col = "#0000FF4C", pch = 20, cex = 1.75)

text(x = .5, y = .15, "WGTV", cex = 1.3)
text(x = .5, y = .85, "no WGTV", cex = 1.3)


## extra information for figure caption
table(e > .95)


## units outside the range of overlap
sel1 <- which(e < min(e[WGTV.nei == TRUE]))
sel2 <- which(e > max(e[WGTV.nei == FALSE]))

dropped.cases <- c(sel1,sel2)
length(dropped.cases)

counties.overlap <- countydata6.nei$county[-dropped.cases]
ps <- data.frame(countydata6.nei$county,e)
out <- list(counties.overlap = counties.overlap, ps = ps)
## save(out, file = "~/Dropbox/Current projects/GDR/Dave analysis/ps RR.RData")


## Exhaustively search for samples with best balance in terms of the minimum
## maximal standardized distance across all covariates
X.lim <- X.nei[-dropped.cases,]
WGTV.lim <- WGTV.nei[-dropped.cases]
e.lim <- e[-dropped.cases]

table(WGTV.lim)

## number of samples to evaluate in millions
choose(34,25)/1000000


##  http://stackoverflow.com/questions/4493287/generating-a-very-large-matrix-of-string-
##  combinations-using-combn-and-bigmemor
".combinadic" <- function(n, r, i) {

  largestV <- function(n, r, i) {
    v <- n                                  # Adjusted for one-based indexing
    while(choose(v,r) >= i) v <- v-1        # Adjusted for one-based indexing
    return(v)
  }

  res <- rep(NA,r)
  for(j in 1:r) {
    res[j] <- largestV(n,r,i)
    i <- i-choose(res[j],r)
    n <- res[j]
    r <- r-1
  }
  res <- res + 1
  return(res)
}


## loop over all possible combinations
b <- 1000
matches <- matrix(NA,1,25)
temp <- rep(NA,24)
treateds <- which(WGTV.lim == TRUE)
controls <- which(WGTV.lim == FALSE)

v <- rep(NA,ncol(X.lim))

o <- 1
for(o in 1:length(v))	{
v[o] <- sqrt(mean(c(var(X.lim[WGTV.lim == TRUE,o]), var(X.lim[WGTV.lim == FALSE,o]))))
						}


m <- 1; k <- 1
for(m in 1:choose(34,25))	{

	sel <- .combinadic(34, 25, m)
		for(k in 1:24)	{
			temp[k] <- mean(X.lim[treateds[sel],k] - X.lim[controls,k]) / v[k]
						}
	temp <- max(abs(temp))

	if(any(temp < b))	{

		matches <- rbind(matches,sel)
		b <- c(b,temp)

		if(length(b) > 100)	{

			s <- order(b)
			matches <- matches[s,][1:100,]
			b <- b[s][1:100]

							}
						}

	if(m %% 1000 == 0)	{ cat(m, min(b), "\n") }

							}

## save.image("~/Dropbox/Current projects/GDR/Dave analysis/exhaustive search RR.Rdata")



load("~/Dropbox/Current projects/GDR/Dave analysis/exhaustive search RR.Rdata")
######
## Create balance table; compare balance to permutation baseline; run baseline Cox model
######
library(Matching)
library(survival)
library(Hmisc)

## set stricter convergence criteria
coxph.control(eps = sqrt(.Machine$double.eps), toler.chol = .Machine$double.eps,
iter.max = 1000, toler.inf = sqrt(.Machine$double.eps), outer.max = 1000)

s <- 100
bal.sample <- array(NA, dim = c(ncol(X.lim),3,s))
cox.res <- matrix(NA,3,s)

m <- 1
for(m in 1:s)	{

	X.match <- X.lim[c(controls,treateds[matches[m,]]),]
	WGTV.match <- WGTV.lim[c(controls,treateds[matches[m,]])]
	matched.counties <- counties.overlap[c(controls,treateds[matches[m,]])]

	set.seed(123)
	a <- 1
	for(a in 1:ncol(X.match))	{

		temp <- balanceUV(	Tr = X.match[,a][WGTV.match == TRUE],
							Co = X.match[,a][WGTV.match == FALSE],
							ks = TRUE, nboots = 10000, paired = FALSE)
		bal.sample[a,2:3,m] <- c(temp$p.value, temp$ks$ks.boot.pvalue)

		## use average variance, following Imbens and Rubin 2013
		bal.sample[a,1,m] <-	(mean(X.match[,a][WGTV.match == TRUE]) - 
					 			 mean(X.match[,a][WGTV.match == FALSE])) /
		sqrt(	(var(X.match[,a][WGTV.match == TRUE]) +
				 var(X.match[,a][WGTV.match == FALSE])) /2)

								}

	## fit baseline Cox model
	sel <- is.element(protest6$county, matched.counties)
	protest.temp <- protest6[sel,]

	cox.out <- coxph(formula = Surv(istart, ifinish, protest) ~
    		itm_dresden2f +

			logpopulation_1988f + 
    		femalef100 +
    		collegef100 +
    		industryf100 +
    		working_agef100 +
    		density_1988f +
    		pop_changef +
    		agriculturef +
    		craftsconstructionf +
    		servicestransportf +
    		unskilledf +
    		skilledf +
    		housing_spacef +
    		bathroomf +
    		interior_toiletf +
    		modern_heatingf +
    		capita_per_mdf + 
    		capita_per_dentistf +
    		nitrogen_oxidesf +
    		sulfur_dioxidef +
    		respirable_dustf +
    		districtcapital2f +
			exits_pop +
			protests53_cities +

			strata(eventnumbnumb),
    		data = protest.temp,
    		method = c("efron"))

	# summary(cox.out)

	cox.res[1,m] <- exp(cox.out$coef[1])
	cox.res[2,m] <- exp(cox.out$coef[1] - sqrt(cox.out$var[1,1]) * qnorm(0.975))
	cox.res[3,m] <- exp(cox.out$coef[1] + sqrt(cox.out$var[1,1]) * qnorm(0.975))

	cat(m,"\n")

				}
## save.image("~/Dropbox/Current projects/GDR/Dave analysis/ALLmodelsTemp RR.RData")



out <- matrix(NA,1,12)
colnames(out) <- c(	"min avg diff","avg avg diff","max avg diff",
					"min max diff","avg max diff","max max diff",
					"min min t","avg min t","max min t",
					"min min KS","avg min KS","max min KS")

out[1] <- min(apply(abs(bal.sample[,1,]),2,mean))
out[2] <- mean(apply(abs(bal.sample[,1,]),2,mean))
out[3] <- max(apply(abs(bal.sample[,1,]),2,mean))

out[4] <- min(apply(abs(bal.sample[,1,]),2,max))
out[5] <- mean(apply(abs(bal.sample[,1,]),2,max))
out[6] <- max(apply(abs(bal.sample[,1,]),2,max))

out[7] <- min(apply(bal.sample[,2,],2,min))
out[8] <- mean(apply(bal.sample[,2,],2,min))
out[9] <- max(apply(bal.sample[,2,],2,min))

out[10] <- min(apply(bal.sample[,3,],2,min))
out[11] <- mean(apply(bal.sample[,3,],2,min))
out[12] <- max(apply(bal.sample[,3,],2,min))

round(out,2)

library(Hmisc)
latex(round(out,2), file = "")
## alternative: use 4 plots showing the whole distributions.



## which is the best balanced sample?
sel <- which(apply(abs(bal.sample[,1,]),2,mean)==min(apply(abs(bal.sample[,1,]),2,mean)))
round(mean(abs(bal.sample[,1,sel])),3)
round(max(abs(bal.sample[,1,sel])),3)

## sort samples in order of decreasing balance
sel <- order(apply(abs(bal.sample[,1,]),2,mean))


## plot of Cox estimates
plot(	1:s,
		seq(from = 0, to = 2, length.out = s),
		type = "n", xaxt = "n",
		ylab = "WGTV effect", cex.lab = 1.3, xlab = "")

abline(h = 1, col = "#3333337F", lwd = 2, lty = 3)

m <- 1
for(m in 1:100)	{
	lines(x = c(m,m), y = c(cox.res[1,sel[m]]-.03, cox.res[2,sel[m]]),
		lwd = 1, col = "#EE82EEB2", lend = 1)
	lines(x = c(m,m), y = c(cox.res[1,sel[m]]+.03, cox.res[3,sel[m]]),
		lwd = 1, col = "#EE82EEB2", lend = 1)
	points(x = m, y = cox.res[1,sel[m]], pch = 20, col = "#EE82EEE5", cex = .75)
				}

mtext(side = 1, at = 0, text = "Best balance", adj = 0, cex = 1.1)
mtext(side = 1, at = 100, text = "Worst balance", adj = 1, cex = 1.1)



## randomization p-value for balance
sel <- c(controls, treateds[matches[sel[1],]])

set.seed(123)
temp.rand <- rep(NA,100000)
temp <- rep(NA,24)
k <- 1; a <- 1
for(k in 1:length(temp.rand))	{

	sel.r <- sample(sel)
	for(a in 1:24)					{
	temp[a] <- (mean(X.lim[,a][sel.r[ 1:25]]) - 
				mean(X.lim[,a][sel.r[26:50]])) /
	sqrt(mean(c(var(X.lim[,a][sel.r[ 1:25]]), var(X.lim[,a][sel.r[26:50]]))))

									}

	temp.rand[k] <- max(abs(temp))

								}

hist(temp.rand)
1/sum(temp.rand > max(abs(bal.sample[,1,1])))




,
